# Load the full Spotify track table and derive the working subsets:
#   dados  - 0.5% random sample of rows, without the first four columns
#   dados2 - rows of `dados` whose year is 2022
#   wines  - `dados2` restricted to its numeric feature columns
dados_originais <- read.csv("spotify_data.csv")

# Seed the generator before sampling so that every run analyses the same
# rows; without it the plots and cluster sizes change from run to run.
set.seed(1234)
n_total <- nrow(dados_originais)
# seq_len() stays empty for a zero-row table (1:0 would yield c(1, 0)), and
# floor() makes the truncation of the fractional sample size explicit.
aux <- sample(seq_len(n_total), size = floor(n_total * 0.005))

dados <- dados_originais[aux, -(1:4)]
# which() drops rows whose year is NA instead of turning them into all-NA rows.
dados2 <- dados[which(dados$year == 2022), ]
# Categorical variables (column positions within `dados`): c(2, 3, 6, 8, 16)
wines <- dados2[, -c(2, 3, 6, 8, 16)]
library(dplyr)
library(ggplot2)
library(GGally)
library(corrplot)
library(factoextra)
library(gridExtra)
library(plotly)
# Univariate distributions of the 2022 sub-sample (dados2), rendered as
# interactive plotly widgets. bins = 30 is ggplot2's default, written out so
# that stat_bin() no longer prints its "Pick better value" message.

# Popularity.
ggplotly(
  ggplot(data = dados2, aes(x = popularity)) +
    geom_histogram(bins = 30, color = "black", fill = "steelblue") +
    labs(x = "Popularidade")
)

# Year.
# NOTE(review): dados2 only holds rows with year == 2022, so this chart is a
# single bar; confirm whether `dados` (all years) was the intended input.
ggplotly(
  ggplot(data = dados2, aes(x = year)) +
    geom_bar(color = "black", fill = "steelblue") +
    labs(x = "Ano")
)

# Danceability.
ggplotly(
  ggplot(data = dados2, aes(x = danceability)) +
    geom_histogram(bins = 30, color = "black", fill = "steelblue") +
    labs(x = "Dançabilidade", y = "Frequencia")
)

# Energy.
# The y label belongs inside labs(); it used to be passed to ggplotly(),
# where it was silently ignored and the axis kept its default title.
ggplotly(
  ggplot(data = dados2, aes(x = energy)) +
    geom_histogram(bins = 30, color = "black", fill = "steelblue") +
    labs(x = "Energia", y = "Frequencia")
)
# Distributions of key, loudness and mode for the 2022 sub-sample (dados2).
#
# The y axes use ggplot2's automatic breaks: the counts in this sub-sample
# are far below the fixed breaks used before (steps of 25,000 to 100,000,
# labelled in thousands), which left each axis with a single tick at 0.

# Key: x ticks 0-11 are relabelled with note names.
ggplotly(
  ggplot(data = dados2, aes(x = key)) +
    geom_bar(color = "black", fill = "steelblue") +
    scale_x_continuous(
      breaks = seq(0, 11, by = 1),
      labels = c("C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B")
    ) +
    labs(x = "Tom", y = "Frequencia")
)

# Loudness.
# Loudness values are negative (the recorded cluster means are about -19 and
# -6), so the breaks must cover the negative range; the former breaks from
# 15 to 60 never intersected the data and produced an axis without ticks.
ggplotly(
  ggplot(data = dados2, aes(x = loudness)) +
    geom_histogram(bins = 30, color = "black", fill = "steelblue") +
    scale_x_continuous(breaks = seq(-60, 0, by = 5)) +
    labs(x = "Decibéis", y = "Frequencia")
)

# Mode: 0 is labelled "Menor" and 1 is labelled "Maior".
ggplotly(
  ggplot(data = dados2, aes(x = mode)) +
    geom_bar(color = "black", fill = "steelblue") +
    scale_x_continuous(breaks = c(0, 1), labels = c("Menor", "Maior")) +
    labs(x = "Escala", y = "Frequencia")
)
# Histograms of speechiness, acousticness, instrumentalness and liveness for
# the 2022 sub-sample (dados2).
#
# The y axes use ggplot2's automatic breaks: the counts in this sub-sample
# are far below the fixed breaks used before (steps of 100,000, labelled in
# thousands), which left each axis with a single tick at 0.
# bins = 30 is ggplot2's default, written out so that stat_bin() no longer
# prints its "Pick better value" message.

# Speechiness.
ggplotly(
  ggplot(data = dados2, aes(x = speechiness)) +
    geom_histogram(bins = 30, color = "black", fill = "steelblue") +
    labs(x = "Fala", y = "Frequencia")
)

# Acousticness.
ggplotly(
  ggplot(data = dados2, aes(x = acousticness)) +
    geom_histogram(bins = 30, color = "black", fill = "steelblue") +
    labs(x = "Acusticidade", y = "Frequencia")
)

# Instrumentalness.
ggplotly(
  ggplot(data = dados2, aes(x = instrumentalness)) +
    geom_histogram(bins = 30, color = "black", fill = "steelblue") +
    labs(x = "Instrumentalidade", y = "Frequencia")
)

# Liveness (axis title: presence of an audience).
ggplotly(
  ggplot(data = dados2, aes(x = liveness)) +
    geom_histogram(bins = 30, color = "black", fill = "steelblue") +
    labs(x = "Presença de audiência", y = "Frequencia")
)
# Remaining univariate plots for the 2022 sub-sample (dados2). Each static
# ggplot is built first and then piped into ggplotly() for an interactive
# widget. The histograms keep ggplot2's default of 30 bins, so stat_bin()
# still prints its "Pick better value with `binwidth`" message for each one.

# Valence.
(ggplot(dados2, aes(valence)) +
  geom_histogram(colour = "black", fill = "steelblue") +
  xlab("Valência") +
  ylab("Frequencia")) %>%
  ggplotly()

# Tempo (axis title: Bpm).
(ggplot(dados2, aes(tempo)) +
  geom_histogram(colour = "black", fill = "steelblue") +
  xlab("Bpm") +
  ylab("Frequencia")) %>%
  ggplotly()

# Duration in milliseconds; the x axis is limited to [0, 1e6].
(ggplot(dados2, aes(duration_ms)) +
  geom_histogram(colour = "black", fill = "steelblue") +
  xlim(0, 1e+06) +
  xlab("Duração") +
  ylab("Frequencia")) %>%
  ggplotly()

# Time signature, with one x tick per integer from 0 to 7.
(ggplot(dados2, aes(time_signature)) +
  geom_bar(colour = "black", fill = "steelblue") +
  scale_x_continuous(breaks = 0:7) +
  xlab("Compasso") +
  ylab("Frequencia")) %>%
  ggplotly()
# Correlation matrix of the numeric features.
# NOTE(review): this uses `dados` (the sample across all years) while every
# other step works on the 2022 subset; confirm that this is intended.
corrplot(cor(dados[, -c(2, 3, 6, 8, 16)]))

# Pairwise dissimilarities between tracks, one heat map per metric.
#
# stand = TRUE standardizes every column before distances are computed. The
# features are on very different scales (duration_ms is in the hundreds of
# thousands, most others lie in [0, 1]), so on raw values every metric would
# effectively measure duration alone.

# Correlation-based distance (1 - Pearson correlation between tracks).
res.dist <- get_dist(dados2[, -c(2, 3, 6, 8, 16)], method = "pearson", stand = TRUE)
fviz_dist(res.dist, lab_size = 8) # Visualize the dissimilarity matrix

# Euclidean distance.
res.dist2 <- get_dist(wines, method = "euclidean", stand = TRUE)
fviz_dist(res.dist2, lab_size = 8) # Visualize the dissimilarity matrix

# Manhattan (city-block) distance.
res.dist3 <- get_dist(wines, method = "manhattan", stand = TRUE)
fviz_dist(res.dist3, lab_size = 8) # Visualize the dissimilarity matrix

# Minkowski distance. With stats::dist()'s default power p = 2 this equals
# the Euclidean distance above; pass p = <value> for a different order.
res.dist4 <- get_dist(wines, method = "minkowski", stand = TRUE)
fviz_dist(res.dist4, lab_size = 8) # Visualize the dissimilarity matrix
#install.packages("d3heatmap")
library(d3heatmap)
##
## ======================
## Welcome to d3heatmap version 0.9.0
##
## Type citation('d3heatmap') for how to cite the package.
## Type ?d3heatmap for the main documentation.
##
## The github page is: https://github.com/talgalili/d3heatmap/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/d3heatmap/issues
## You may ask questions at stackoverflow, use the r and d3heatmap tags:
## https://stackoverflow.com/questions/tagged/d3heatmap
## ======================
##
## Attaching package: 'd3heatmap'
## The following objects are masked from 'package:base':
##
## print, save
# Interactive heat map of the standardized numeric features.
# In the recorded run this call warned that RColorBrewer allows at most 11
# colours for the RdYlBu palette and that some values fell outside the colour
# scale and were treated as NA.
d3heatmap(scale(wines), colors = "RdYlBu")

# Hierarchical clustering (Ward linkage, Euclidean distance).
# stand = TRUE standardizes the columns first, matching the heat map above;
# on raw values the distances are dominated by duration_ms, whose magnitude
# is far larger than that of the [0, 1] features.
# NOTE(review): k = 100 cuts the tree into 100 groups, so rect = TRUE below
# draws 100 rectangles; confirm that this many clusters is intended.
# In the recorded run factoextra triggered a ggplot2 deprecation warning
# about guides(<scale> = FALSE); it comes from the package, not this script.
hc <- eclust(
  wines, "hclust",
  stand = TRUE,
  hc_metric = "euclidean",
  hc_method = "ward.D",
  k = 100
)
fviz_dend(hc, rect = TRUE)

# Dendrogram with hclust()'s default linkage on the same standardized data.
fviz_dend(hclust(dist(scale(wines))))
# Standardize the numeric features (scale() centres each column and divides
# it by its standard deviation) so that no variable dominates the k-means
# distances.
dadosNorm <- as.data.frame(scale(dados2[, -c(2, 3, 6, 8, 16)]))

# Fixed seed: kmeans() picks its starting centres at random.
set.seed(1234)
dados_k2 <- kmeans(dadosNorm, centers = 2)

# Number of tracks per cluster (recorded run: 49 and 214).
dados_k2$size

# Share of tracks per cluster. The denominator must be the number of rows
# that were clustered; dividing by nrow(dados) (the all-years sample) gave
# shares that summed to about 0.045 instead of 1.
# For the recorded run this yields about 0.186 and 0.814.
dados_k2$size / sum(dados_k2$size)
# O primeiro cluster tem 49 observações (cerca de 19%), enquanto o segundo tem 214 (cerca de 81%).
# Per-cluster mean of every numeric feature, in the original (unscaled)
# units, grouped by the cluster label that k-means assigned to each row.
aggregate(
  x = dados2[-c(2, 3, 6, 8, 16)],
  by = list(dados_k2$cluster),
  FUN = mean
)
# Output recorded from the original run:
## Group.1 popularity danceability energy loudness speechiness acousticness
## 1 1 28.95918 0.3948367 0.2740090 -19.388000 0.05287551 0.7914496
## 2 2 31.40654 0.5742710 0.7610327 -6.389164 0.10392243 0.1981805
## instrumentalness liveness valence tempo duration_ms
## 1 0.6218577 0.2226163 0.2127294 100.4153 176786.1
## 2 0.1654614 0.2384033 0.4703551 128.0888 217644.9
# How many clusters? Compare the between-cluster and the total within-cluster
# sum of squares for k = 1..10.
k_values <- 1:10

# Fit k-means ONCE per k and read both statistics from that single fit.
# Fitting separately for each statistic used different random starts, so bss
# and wss described different partitions. nstart = 25 keeps the best of 25
# random starts, which makes the curves much less sensitive to initialization.
fits <- lapply(
  k_values,
  function(k) kmeans(dadosNorm, centers = k, nstart = 25)
)
bss <- vapply(fits, function(fit) fit$betweenss, numeric(1))
wss <- vapply(fits, function(fit) fit$tot.withinss, numeric(1))

ss_by_k <- data.frame(k = k_values, bss = bss, wss = wss)

# Between-cluster sum of squares vs choice of k.
# Built with ggplot() because qplot() is deprecated in current ggplot2.
p3 <- ggplot(ss_by_k, aes(x = k, y = bss)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  labs(x = "Number of clusters", y = "Between-cluster sum of squares") +
  theme_bw()

# Total within-cluster sum of squares vs choice of k.
p4 <- ggplot(ss_by_k, aes(x = k, y = wss)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  labs(x = "Number of clusters", y = "Total within-cluster sum of squares") +
  theme_bw()

# Show both curves side by side.
grid.arrange(p3, p4, ncol = 2)
# A partir das somas de quadrados, 2 ou 3 clusters seriam ideais. O ganho
# com a inclusão de mais clusters é, em geral, similar, com exceção da
# diferença entre 6 e 7 clusters.